Complex Sampling in National Surveys
Pusat Penyelidikan Penyakit Tak Berjangkit, Institut Kesihatan Umum
Sunday, 16 November 2025
pacman::p_load(tidyverse, arrow)
# Population pyramid input: one row per sex x age band for 1 Jan 2025,
# all ethnicities combined. Male counts are negated so their bars extend
# to the left of zero.
pyr_df <- read_parquet(
  "https://storage.dosm.gov.my/population/population_malaysia.parquet"
) %>%
  filter(date == as.Date("2025-01-01")) %>%
  filter(sex %in% c("male", "female")) %>%
  filter(age != "overall", ethnicity == "overall") %>%
  mutate(
    pop_k = population,
    pop = if_else(sex == "male", -pop_k, pop_k)
  ) %>%
  mutate(
    # Order the age bands by their leading number rather than alphabetically.
    age0 = readr::parse_number(age),
    age = fct_reorder(age, age0)
  )

# Mirrored horizontal bar chart; axis labels show absolute values so the
# negated male counts are displayed as positive numbers.
my_pyr_plot <- ggplot(pyr_df, aes(x = age, y = pop, fill = sex)) +
  geom_col(width = 0.9) +
  coord_flip() +
  scale_y_continuous(
    limits = c(-1, 1) * 2000,
    breaks = seq(-2000, 2000, by = 500),
    labels = function(value) scales::comma(abs(value)),
    expand = expansion(mult = 0.02)
  ) +
  labs(
    title = "Malaysia Population Pyramid, 2025",
    x = "Age group (years)",
    y = "Population (thousands)",
    fill = "Sex"
  ) +
  theme_minimal(base_size = 13) +
  theme(panel.grid.minor = element_blank())
my_pyr_plot

| Category | Overall % | 95% CI | Male % | 95% CI | Female % | 95% CI |
|---|---|---|---|---|---|---|
| Malaysia | 15.6 | 14.4–16.9 | 15.0 | 13.6–16.5 | 16.2 | 14.7–18.0 |
| Age Group | ||||||
| 18–29 | 3.2 | 2.2–4.6 | 3.7 | 2.2–6.1 | 2.6 | 1.7–4.1 |
| 30–39 | 6.5 | 5.2–8.1 | 6.9 | 5.0–9.3 | 6.0 | 4.5–7.9 |
| 40–49 | 15.2 | 13.2–17.4 | 13.7 | 11.1–16.8 | 16.8 | 14.2–19.8 |
| 50–59 | 28.8 | 25.0–33.0 | 28.4 | 24.2–33.0 | 29.3 | 24.4–34.7 |
| 60+ | 38.0 | 35.4–40.7 | 37.7 | 34.0–41.5 | 38.4 | 35.0–41.8 |
| Ethnicity | ||||||
| Malay | 16.2 | 15.1–17.4 | 15.5 | 14.1–17.1 | 16.9 | 15.4–18.4 |
| Chinese | 15.1 | 11.6–19.5 | 14.8 | 11.2–19.3 | 15.5 | 11.0–21.3 |
| Indian | 26.4 | 22.1–31.2 | 28.4 | 22.1–35.7 | 24.5 | 19.4–30.4 |
| B. Sabah | 9.3 | 7.3–11.8 | 9.5 | 6.8–13.0 | 9.1 | 6.5–12.6 |
| B. Sarawak | 17.2 | 13.0–22.3 | 14.9 | 10.4–21.0 | 19.3 | 14.3–25.6 |
| Others | 10.2 | 7.5–13.6 | 10.0 | 6.6–14.8 | 10.6 | 6.4–17.0 |
# Simulated survey sample: 1,100 respondents, one row each, built from fixed
# stratum sizes (age group x gender x ethnicity) with a hand-specified
# diabetes indicator `dm` (0/1) and a constant weight `adw`.
# NOTE(review): ages are drawn with sample() and no seed is set in this
# chunk, so `age` differs between runs unless a seed is set upstream.
tibble(
  age_group = c("18-29", "30-39", "40-49", "50-59", "60+"),
  n_total = c(200, 200, 200, 200, 300)
) %>%
  # 40% male within each age group; the remainder is female.
  mutate(male = as.integer(round(.4 * n_total)), female = n_total - male) %>%
  pivot_longer(male:female, names_to = "gender", values_to = "n_gender") %>%
  # 65% malay, 20% chinese, remainder indian within each age x gender cell.
  mutate(
    malay = as.integer(round(.65 * n_gender)),
    chinese = as.integer(round(.2 * n_gender)),
    indian = n_gender - malay - chinese
  ) %>%
  pivot_longer(malay:indian, names_to = "ethnicity", values_to = "n_ethnic") %>%
  # Expand the stratum counts to one row per respondent, then drop the
  # remaining helper count columns.
  uncount(n_ethnic) %>%
  select(-starts_with("n_")) %>%
  group_by(age_group) %>%
  # Draw an integer age (with replacement) inside each respondent's band;
  # the open-ended "60+" band is drawn from 60 to 90.
  mutate(
    age = case_when(
      age_group == "18-29" ~ sample(18:29, n(), replace = TRUE),
      age_group == "30-39" ~ sample(30:39, n(), replace = TRUE),
      age_group == "40-49" ~ sample(40:49, n(), replace = TRUE),
      age_group == "50-59" ~ sample(50:59, n(), replace = TRUE),
      .default = sample(60:90, n(), replace = TRUE)
    )
  ) %>%
  ungroup() %>%
  mutate(
    # Rows are ordered age group > gender (male, female) > ethnicity (malay,
    # chinese, indian). Each stratum is a run of 0s followed by a run of 1s,
    # so the run lengths below alternate (count of 0, count of 1) per stratum:
    # six strata per age group, 60 runs in total, summing to 1,100 rows.
    dm = rep(
      rep(c(0, 1), times = 30),
      times = c(
        # male: malay, chinese, indian | female: malay, chinese, indian
        50, 2, 15, 1, 11, 1, 76, 2, 23, 1, 17, 1, # 18-29
        48, 4, 15, 1, 11, 1, 73, 5, 23, 1, 16, 2, # 30-39
        45, 7, 14, 2, 9, 3, 65, 13, 20, 4, 13, 5, # 40-49
        37, 15, 12, 4, 6, 6, 55, 23, 18, 6, 9, 9, # 50-59
        49, 29, 16, 8, 7, 11, 72, 45, 23, 13, 10, 17 # 60+
      )
    ),
    adw = 20
  ) # adjusted design weight = 20, assume dw = 20 and RR = 100% for all

| Characteristic | N = 1,1001 |
|---|---|
| dm | |
| No DM | 858 (78.0%) |
| DM | 242 (22.0%) |
| 1 n (%) | |
| Age Group | Sex | Ethnicity | Sample Count (n) | Init. Est. Pop. | Malaysia Population ('000) | Post-strat. Factor |
|---|---|---|---|---|---|---|
| 18-29 | male | malay | 52 | 1040 | 1910.68 | 1.8371923 |
| 18-29 | male | indian | 12 | 240 | 201.46 | 0.8394167 |
| 18-29 | female | malay | 78 | 1560 | 1790.28 | 1.1476154 |
| 18-29 | female | indian | 18 | 360 | 188.38 | 0.5232778 |
| 40-49 | male | malay | 52 | 1040 | 1232.30 | 1.1849038 |
| 40-49 | male | indian | 12 | 240 | 161.50 | 0.6729167 |
| 40-49 | female | malay | 78 | 1560 | 1203.60 | 0.7715385 |
| 40-49 | female | indian | 18 | 360 | 155.50 | 0.4319444 |
| 60+ | male | malay | 78 | 1560 | 982.90 | 0.6300641 |
| 60+ | male | indian | 18 | 360 | 129.20 | 0.3588889 |
| 60+ | female | malay | 117 | 2340 | 1064.90 | 0.4550855 |
| 60+ | female | indian | 27 | 540 | 149.60 | 0.2770370 |
| Age Group | Sex | Ethnicity | Sample Count (n) | Init. Est. Pop. | Malaysia Population ('000) | Post-strat. Factor |
|---|---|---|---|---|---|---|
| 18-29 | male | malay | 52 | 1040 | 1910.68 | 1.8371923 |
| 30-39 | male | malay | 52 | 1040 | 1419.40 | 1.3648077 |
| 40-49 | male | malay | 52 | 1040 | 1232.30 | 1.1849038 |
| 50-59 | male | malay | 52 | 1040 | 814.80 | 0.7834615 |
| 60+ | male | malay | 78 | 1560 | 982.90 | 0.6300641 |
| 18-29 | female | malay | 78 | 1560 | 1790.28 | 1.1476154 |
| 30-39 | female | malay | 78 | 1560 | 1419.10 | 0.9096795 |
| 40-49 | female | malay | 78 | 1560 | 1203.60 | 0.7715385 |
| 50-59 | female | malay | 78 | 1560 | 828.40 | 0.5310256 |
| 60+ | female | malay | 117 | 2340 | 1064.90 | 0.4550855 |
| Sex | Ethnicity | Age Group | Sample Count (n) | Init. Est. Pop. | Malaysia Population ('000) | Post-strat. Factor |
|---|---|---|---|---|---|---|
| male | malay | 18-29 | 52 | 1040 | 1910.68 | 1.8371923 |
| female | malay | 18-29 | 78 | 1560 | 1790.28 | 1.1476154 |
| male | malay | 40-49 | 52 | 1040 | 1232.30 | 1.1849038 |
| female | malay | 40-49 | 78 | 1560 | 1203.60 | 0.7715385 |
| male | malay | 60+ | 78 | 1560 | 982.90 | 0.6300641 |
| female | malay | 60+ | 117 | 2340 | 1064.90 | 0.4550855 |
| Characteristic | Crude (Unweighted) | Weighted (Post-Stratified) |
|---|---|---|
| | DM N = 242¹ | DM N = 3,269¹ |
| my | ||
| Overall | 242 (22.0%) | 3,269 (16.7%) |
| gender | ||
| female | 147 (22.3%) | 1,672 (17.2%) |
| male | 95 (21.6%) | 1,597 (16.2%) |
| age_group | ||
| 18-29 | 8 (4.0%) | 211 (4.0%) |
| 30-39 | 14 (7.0%) | 284 (6.8%) |
| 40-49 | 34 (17.0%) | 609 (15.8%) |
| 50-59 | 63 (31.5%) | 816 (29.5%) |
| 60+ | 123 (41.0%) | 1,349 (38.8%) |
| ¹ n (%) | ||
Complex Sampling Design | NHMS | R Conference 2025